Data Backfill

The goal of this doc is to execute an initial data pull of the hourly demand for California balancing authority subregion (CISO). This includes the following four independent system operators:

The data backfill process includes the following steps:

Load Libraries and Functions

library(dplyr)
library(EIAapi)
library(jsonlite)
library(gt)
library(plotly)
library(modeltime)
source("../pipeline/eia_data.R")
source("../pipeline/backtesting.R")
meta_json <- read_json(path = "../settings/settings.json")
s <- meta_json$series
series <- lapply(1:length(s), function(i) {
    subba_id <- NULL
    # subba_id <- s[[i]]$subba_id

    if (!is.na(as.numeric(s[[i]]$subba_id))) {
        subba_id <- as.numeric(s[[i]]$subba_id)
    } else {
        subba_id <- s[[i]]$subba_id
    }

    return(data.frame(
        parent_id = s[[i]]$parent_id,
        parent_name = s[[i]]$parent_name,
        subba_id = subba_id,
        subba_name = s[[i]]$subba_name
    ))
}) |>
    bind_rows()

api_path <- meta_json$api_path
meta_path <- meta_json$meta_path
data_path <- meta_json$data_path
forecast_path <- meta_json$forecast_path
forecast_log_path <- meta_json$forecast_log_path
calibrated_models_path <- meta_json$calibrated_models_path
h <- meta_json$backtesting$h
lags <- meta_json$backtesting$features$lags |> unlist()
train_length <- meta_json$train_length
facets_template <- list(
    parent = NULL,
    subba = NULL
)

start <- as.POSIXct(paste(
    paste(
        meta_json$start$year,
        meta_json$start$month,
        meta_json$start$day,
        sep = "-"
    ),
    " ",
    meta_json$start$hour,
    ":00:00",
    sep = ""
))


end <- as.POSIXct(paste(
    paste(
        meta_json$end$year,
        meta_json$end$month,
        meta_json$end$day,
        sep = "-"
    ),
    " ",
    meta_json$end$hour,
    ":00:00",
    sep = ""
))


# start <- as.POSIXct("2024-05-18 08:00:00")

# end <- as.POSIXct("2024-06-01 01:00:00")
attr(start, "tzone") <- "UTC"
attr(end, "tzone") <- "UTC"

offset <- 2000

eia_api_key <- Sys.getenv("EIA_API_KEY")
metadata <- eia_metadata(api_key = eia_api_key, api_path = api_path)
Warning: input string 'The api_key argument is missing... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_key argument is not valid... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_path argument is missing... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_path argument is not valid, must be a character object ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'Could not pull the metadata... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'Could not parse the metadata JSON... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_key argument is missing... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_key argument is not valid... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_path argument is missing... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_path argument is not valid, must be a character object ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'Could not pull the metadata... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'Could not parse the metadata JSON... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
print(names(metadata))
 [1] "id"                "name"              "description"      
 [4] "frequency"         "facets"            "data"             
 [7] "startPeriod"       "endPeriod"         "defaultDateFormat"
[10] "defaultFrequency"  "command"          
print(metadata$startPeriod)
[1] "2018-06-19T05"
print(metadata$endPeriod)
[1] "2024-07-10T07"
meta <- NULL
data <- NULL
for (i in 1:nrow(series)) {
    facets <- facets_template
    facets["parent"] <- series[i, "parent_id"]
    facets["subba"] <- series[i, "subba_id"]
    print(facets)

    temp <- eia_backfill(
        start = start,
        end = end,
        offset = offset,
        api_key = eia_api_key,
        api_path = paste(api_path, "data", sep = ""),
        facets = facets
    )
    index <- seq.POSIXt(from = start, to = end, by = "hour")
    ts_obj <- data.frame(period = index, subba = series[i, "subba_id"]) |>
        left_join(temp, by = c("period" = "time", "subba"))

    # Impute missing values
    nas <- which(is.na(ts_obj$value))

    ts_obj$type <- ifelse(is.na(ts_obj$value), "impute", "actual")

    for (l in nas) {
        if (l > 48) {
            ts_obj$value[l] <- (ts_obj$value[l - 24] + ts_obj$value[l - 48]) / 2
        }
    }

    meta_temp <- create_metadata(data = ts_obj, start = start, end = end, type = "backfill")
    meta_temp$index <- 1
    meta_df <- as.data.frame(meta_temp)

    meta <- rbind(meta, meta_df)
    data <- rbind(data, ts_obj)
}
$parent
[1] "CISO"

$subba
[1] "PGAE"

$parent
[1] "CISO"

$subba
[1] "SCE"

$parent
[1] "CISO"

$subba
[1] "SDGE"

$parent
[1] "CISO"

$subba
[1] "VEA"
print(meta)
  index parent subba                time               start
1     1   CISO  PGAE 2024-07-11 02:01:31 2018-07-01 08:00:00
2     1   CISO   SCE 2024-07-11 02:02:01 2018-07-01 08:00:00
3     1   CISO  SDGE 2024-07-11 02:02:37 2018-07-01 08:00:00
4     1   CISO   VEA 2024-07-11 02:03:16 2018-07-01 08:00:00
                  end           start_act             end_act start_match
1 2024-06-28 01:00:00 2018-07-01 08:00:00 2024-06-28 01:00:00        TRUE
2 2024-06-28 01:00:00 2018-07-01 08:00:00 2024-06-28 01:00:00        TRUE
3 2024-06-28 01:00:00 2018-07-01 08:00:00 2024-06-28 01:00:00        TRUE
4 2024-06-28 01:00:00 2018-07-01 08:00:00 2024-06-28 01:00:00        TRUE
  end_match n_obs na     type update success comments
1      TRUE 52530  0 backfill  FALSE   FALSE         
2      TRUE 52530  0 backfill  FALSE   FALSE         
3      TRUE 52530  0 backfill  FALSE   FALSE         
4      TRUE 52530  0 backfill  FALSE   FALSE         
# The initial pull has some missing values
head(data)
               period subba               subba_name parent
1 2018-07-01 08:00:00  PGAE Pacific Gas and Electric   CISO
2 2018-07-01 09:00:00  PGAE Pacific Gas and Electric   CISO
3 2018-07-01 10:00:00  PGAE Pacific Gas and Electric   CISO
4 2018-07-01 11:00:00  PGAE Pacific Gas and Electric   CISO
5 2018-07-01 12:00:00  PGAE Pacific Gas and Electric   CISO
6 2018-07-01 13:00:00  PGAE Pacific Gas and Electric   CISO
                             parent_name value   value_units   type
1 California Independent System Operator 12522 megawatthours actual
2 California Independent System Operator 11745 megawatthours actual
3 California Independent System Operator 11200 megawatthours actual
4 California Independent System Operator 10822 megawatthours actual
5 California Independent System Operator 10644 megawatthours actual
6 California Independent System Operator 10559 megawatthours actual
# Save the data
d <- append_data(data_path = data_path, new_data = data, init = TRUE, save = TRUE)
[1] "Initial data pull"
[1] "Save the data to CSV file"
# Save the metadata
meta["success"] <- TRUE
meta["update"] <- TRUE
m <- append_metadata(meta_path = meta_path, new_meta = meta, init = TRUE, save = TRUE)
[1] "Saving the metadata file"

Plot the Series

We will use Plotly to visualize the series:

end <- lubridate::floor_date((max(data$period)), unit = "day")
d <- data |>
    arrange(subba, period) |>
    dplyr::filter(period < end)

d$subba <- as.character(d$subba)

p <- plot_ly(d, x = ~period, y = ~value, color = ~subba, type = "scatter", mode = "lines")

p

Create an Inital Forecast

head(d)
               period subba               subba_name parent
1 2018-07-01 08:00:00  PGAE Pacific Gas and Electric   CISO
2 2018-07-01 09:00:00  PGAE Pacific Gas and Electric   CISO
3 2018-07-01 10:00:00  PGAE Pacific Gas and Electric   CISO
4 2018-07-01 11:00:00  PGAE Pacific Gas and Electric   CISO
5 2018-07-01 12:00:00  PGAE Pacific Gas and Electric   CISO
6 2018-07-01 13:00:00  PGAE Pacific Gas and Electric   CISO
                             parent_name value   value_units   type
1 California Independent System Operator 12522 megawatthours actual
2 California Independent System Operator 11745 megawatthours actual
3 California Independent System Operator 11200 megawatthours actual
4 California Independent System Operator 10822 megawatthours actual
5 California Independent System Operator 10644 megawatthours actual
6 California Independent System Operator 10559 megawatthours actual
fc <- NULL
calibrated_models <- readRDS(calibrated_models_path)
fc <- create_forecast_subba(
    input = d,
    selected_models = calibrated_models,
    h = h,
    index = "period",
    var = "value",
    lags = lags,
    seasonal = TRUE,
    trend = TRUE
)
Add lag: 24
Add lag: 25
Add lag: 26
Add lag: 27
Add lag: 28
Add lag: 48
Add lag: 72
Add lag: 8760
Add lag: 24
Add lag: 25
Add lag: 26
Add lag: 27
Add lag: 28
Add lag: 48
Add lag: 72
Add lag: 8760
Add lag: 24
Add lag: 25
Add lag: 26
Add lag: 27
Add lag: 28
Add lag: 48
Add lag: 72
Add lag: 8760
Add lag: 24
Add lag: 25
Add lag: 26
Add lag: 27
Add lag: 28
Add lag: 48
Add lag: 72
Add lag: 8760
if (!is.null(fc)) {
    create_forecast_log(forecast = fc, forecast_log_path = forecast_log_path, h = h, init = TRUE, save = TRUE)

    save_forecast(forecast = fc, forecast_path = forecast_path, init = TRUE, save = TRUE)

    p <- plot_forecast(
        input = d,
        forecast = fc,
        hours = 24 * 4,
        index = "period",
        var = "value"
    )

    p
}
Initialize the forecast file
Save the forecast to ../data/forecast.csv